"""
Phase 4: Panel Regressions Within the Zone
Development Threshold Paper

PanelGLS regressions restricted to country-years in the $9k-$25k zone.
Tests whether demographic effects operate differently within the threshold.
"""

import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '/mnt/c/demographics_capital_flows/multilateral/src')
from model import PanelGLS

# ═══════════════════════════════════════════════════════════════════════
# Load data
# ═══════════════════════════════════════════════════════════════════════
full = pd.read_csv('/mnt/c/demographics_capital_flows/multilateral/140_country/data/processed/full_panel.csv')
panel = full[full['year'] <= 2024].copy()

try:
    rents = pd.read_csv('/mnt/c/demographics_capital_flows/multilateral/140_country/data/raw/wdi_resource_rents.csv')
    panel = panel.merge(rents[['iso3', 'year', 'resource_rents_gdp']], on=['iso3', 'year'], how='left')
except:
    panel['resource_rents_gdp'] = np.nan

LOWER = 9000
UPPER = 25000

# Zone indicator
panel['in_zone'] = ((panel['gdp_pc_ppp'] >= LOWER) & (panel['gdp_pc_ppp'] <= UPPER)).astype(float)
panel['above_zone'] = (panel['gdp_pc_ppp'] > UPPER).astype(float)
panel['Z_1_x_zone'] = panel['Z_1'] * panel['in_zone']
panel['Z_1_x_above'] = panel['Z_1'] * panel['above_zone']

# Standard controls
controls = ['fiscal_bal_gdp', 'nfa_gdp_lag', 'rgdp_growth', 'log_rel_opw', 'kaopen']

def run_gls(df, y_var, x_vars, label):
    """Run PanelGLS and print results."""
    subset = df[['iso3', 'year', y_var] + x_vars].dropna()
    if len(subset) < 50:
        print(f"  {label}: insufficient obs ({len(subset)})")
        return None

    gls = PanelGLS()
    gls.fit(subset[y_var], subset[x_vars], subset['iso3'], subset['year'])

    print(f"\n  {label}")
    print(f"  N={gls.n_obs}, R²={gls.r_squared:.4f}")
    results = []
    for i, var in enumerate(x_vars):
        sig = '***' if gls.pvalues[i] < 0.01 else '**' if gls.pvalues[i] < 0.05 else '*' if gls.pvalues[i] < 0.1 else ''
        print(f"    {var:30s}  β={gls.beta[i]:8.3f}  se={gls.se[i]:7.3f}  p={gls.pvalues[i]:.4f}{sig}")
        results.append({
            'variable': var, 'beta': gls.beta[i], 'se': gls.se[i], 'pvalue': gls.pvalues[i]
        })
    return {'label': label, 'n': gls.n_obs, 'r2': gls.r_squared, 'results': results}

# ═══════════════════════════════════════════════════════════════════════
# 4a. CA/GDP regressions: full sample vs zone only vs interaction
# ═══════════════════════════════════════════════════════════════════════
print("=" * 70)
print("4a. CA/GDP: FULL SAMPLE vs IN-ZONE vs INTERACTION")
print("=" * 70)

# Full sample baseline
r1 = run_gls(panel, 'ca_gdp', ['Z_1'] + controls, 'Full sample: CA = f(Z₁)')

# Zone only
zone_panel = panel[panel['in_zone'] == 1].copy()
r2 = run_gls(zone_panel, 'ca_gdp', ['Z_1'] + controls, 'Zone only ($9k-$25k): CA = f(Z₁)')

# Below zone only
below_panel = panel[panel['gdp_pc_ppp'] < LOWER].copy()
r3 = run_gls(below_panel, 'ca_gdp', ['Z_1'] + controls, 'Below zone (<$9k): CA = f(Z₁)')

# Above zone only
above_panel = panel[panel['gdp_pc_ppp'] > UPPER].copy()
r4 = run_gls(above_panel, 'ca_gdp', ['Z_1'] + controls, 'Above zone (>$25k): CA = f(Z₁)')

# Interaction model
r5 = run_gls(panel, 'ca_gdp', ['Z_1', 'in_zone', 'Z_1_x_zone', 'above_zone', 'Z_1_x_above'] + controls,
             'Full sample with zone interactions')

# ═══════════════════════════════════════════════════════════════════════
# 4b. Growth determinants in the zone
# ═══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("4b. GROWTH DETERMINANTS IN THE ZONE")
print("=" * 70)

growth_controls = ['fiscal_bal_gdp', 'nfa_gdp_lag', 'kaopen', 'log_rel_opw']

r6 = run_gls(zone_panel, 'rgdp_growth', ['Z_1'] + growth_controls,
             'Zone: Growth = f(Z₁)')

# Add savings and investment
if 'gross_savings_gdp' in zone_panel.columns:
    r7 = run_gls(zone_panel, 'rgdp_growth', ['Z_1', 'gross_savings_gdp', 'gross_investment_gdp'] + growth_controls,
                 'Zone: Growth = f(Z₁, savings, investment)')

# Add resource rents
r8 = run_gls(zone_panel, 'rgdp_growth', ['Z_1', 'resource_rents_gdp'] + growth_controls,
             'Zone: Growth = f(Z₁, resources)')

# ═══════════════════════════════════════════════════════════════════════
# 4c. Savings behavior in zone
# ═══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("4c. SAVINGS IN THE ZONE")
print("=" * 70)

if 'gross_savings_gdp' in panel.columns:
    sav_controls = ['fiscal_bal_gdp', 'nfa_gdp_lag', 'rgdp_growth', 'kaopen']

    r9 = run_gls(panel, 'gross_savings_gdp', ['Z_1'] + sav_controls,
                 'Full sample: Savings = f(Z₁)')

    r10 = run_gls(zone_panel, 'gross_savings_gdp', ['Z_1'] + sav_controls,
                  'Zone only: Savings = f(Z₁)')

    # Is the savings-Z₁ relationship different in the zone?
    r11 = run_gls(panel, 'gross_savings_gdp', ['Z_1', 'in_zone', 'Z_1_x_zone'] + sav_controls,
                  'Savings with zone interaction')

# ═══════════════════════════════════════════════════════════════════════
# 4d. Investment behavior in zone
# ═══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("4d. INVESTMENT IN THE ZONE")
print("=" * 70)

if 'gross_investment_gdp' in panel.columns:
    inv_controls = ['fiscal_bal_gdp', 'nfa_gdp_lag', 'rgdp_growth', 'kaopen']

    r12 = run_gls(panel, 'gross_investment_gdp', ['Z_1'] + inv_controls,
                  'Full sample: Investment = f(Z₁)')

    r13 = run_gls(zone_panel, 'gross_investment_gdp', ['Z_1'] + inv_controls,
                  'Zone only: Investment = f(Z₁)')

    # S-I gap in zone
    zone_panel_si = zone_panel.copy()
    zone_panel_si['si_gap'] = zone_panel_si['gross_savings_gdp'] - zone_panel_si['gross_investment_gdp']
    r14 = run_gls(zone_panel_si, 'si_gap', ['Z_1'] + inv_controls,
                  'Zone: S-I gap = f(Z₁)')

# ═══════════════════════════════════════════════════════════════════════
# 4e. Capital account opening during transit
# ═══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("4e. CAPITAL ACCOUNT OPENING DURING TRANSIT")
print("=" * 70)

# Event study: KAOPEN change from 5 years before zone entry to 5 years after
cross_df = pd.read_csv('/mnt/c/demographics_capital_flows/development_threshold/data/crossing_data.csv')
zone_entrants = cross_df[cross_df['cross_9k'].notna() & (cross_df['status'] != 'Always above')]

event_data = []
for _, row in zone_entrants.iterrows():
    iso = row['iso3']
    entry = row['cross_9k']
    c = panel[(panel['iso3'] == iso) & (panel['kaopen'].notna())]
    for t in range(-10, 21):
        yr = entry + t
        obs = c[c['year'] == yr]
        if len(obs) > 0:
            event_data.append({
                'iso3': iso, 'event_time': t, 'kaopen': obs['kaopen'].values[0],
                'crossed': row['exited_above']
            })

event_df = pd.DataFrame(event_data)

# Average KAOPEN by event time, split by crossers/non-crossers
print(f"\nKAOPEN event study (relative to zone entry):")
print(f"{'Event time':>10s} {'Crossers':>10s} {'Non-cross':>10s} {'Diff':>8s}")
print("-" * 45)
for t in [-10, -5, 0, 5, 10, 15, 20]:
    c_kao = event_df[(event_df['event_time'] == t) & (event_df['crossed'] == 1)]['kaopen']
    nc_kao = event_df[(event_df['event_time'] == t) & (event_df['crossed'] == 0)]['kaopen']
    if len(c_kao) > 0 and len(nc_kao) > 0:
        print(f"  t={t:+3d}       {c_kao.mean():10.2f} {nc_kao.mean():10.2f} {c_kao.mean()-nc_kao.mean():+8.2f}")

# ═══════════════════════════════════════════════════════════════════════
# 4f. Nonlinear: is there "escape velocity" within the zone?
# ═══════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("4f. ESCAPE VELOCITY: NONLINEAR EFFECTS WITHIN ZONE")
print("=" * 70)

# Quadratic GDP within zone
zone_panel['gdp_k'] = zone_panel['gdp_pc_ppp'] / 1000  # in thousands for readability
zone_panel['gdp_k_sq'] = zone_panel['gdp_k'] ** 2

r15 = run_gls(zone_panel, 'rgdp_growth', ['Z_1', 'gdp_k', 'gdp_k_sq'] + growth_controls,
              'Zone: Growth = f(Z₁, GDP, GDP²)')

# Split zone into sub-bands
for band_label, lo, hi in [('Lower zone $9k-$15k', 9000, 15000),
                             ('Mid zone $15k-$20k', 15000, 20000),
                             ('Upper zone $20k-$25k', 20000, 25000)]:
    sub = panel[(panel['gdp_pc_ppp'] >= lo) & (panel['gdp_pc_ppp'] < hi)]
    r = run_gls(sub, 'ca_gdp', ['Z_1'] + controls, f'{band_label}: CA = f(Z₁)')

# ═══════════════════════════════════════════════════════════════════════
# Save results table
# ═══════════════════════════════════════════════════════════════════════
all_results = []
for r in [r1, r2, r3, r4, r5, r6, r8, r9, r10, r11, r12, r13]:
    if r is not None:
        for res in r['results']:
            all_results.append({
                'model': r['label'], 'n': r['n'], 'r2': r['r2'],
                'variable': res['variable'], 'beta': res['beta'],
                'se': res['se'], 'pvalue': res['pvalue']
            })

pd.DataFrame(all_results).to_csv(
    '/mnt/c/demographics_capital_flows/development_threshold/output/tables/table11_panel_zone.csv', index=False)

print("\n✓ Phase 4 complete.")
